The data are available from 2017 to today; we used four entire years of data (2017, 2018, 2019, and 2020) to match the AIS records.
Code
# SAR detections were originally read from the per-year CSV exports for
# 2017-2020; the commented code is kept for provenance.
# (The old comment said "2016 folder" -- the source folder actually covers
# 2017-2020.)
#path <- "Data/SAR Vessel detections 2017-2020"
# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))

# Load the pre-assembled SAR detections (provides object `SAR_fishing`)
load(here::here("Data", "SAR_fishing.Rdata"))

# Aggregate SAR detections onto a 0.01-degree grid (2-decimal rounding).
# The fishing_score filter is applied BEFORE grouping: it only uses
# row-level values, so the result is identical to filtering after
# group_by(), but non-fishing detections are dropped before the
# (expensive) grouping step.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize coordinates onto the common analysis grid and re-aggregate.
# `standardize_coords` is defined elsewhere in this document.
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE),
            .groups = "drop")

# Create the world map backdrop
world_map <- map_data("world")

# Global map of SAR fishing-vessel detections (log1p color scale)
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std,
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
Compare AIS, and SAR detection locations
Identifying grid cells with only AIS detections, only SAR detections, or both data types.
Code
# Merge the standardized AIS and SAR grids on the shared coordinates,
# keeping cells present in either dataset.
combined_data <- full_join(
  AIS_data_std, SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Label every grid cell by which data source(s) detected fishing there.
# After the full join, a cell missing from one source carries NA in that
# source's column, hence the explicit is.na() checks.
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 &
      (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) &
      total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# World map backdrop
world_map <- map_data("world")

# One fill color per category
category_colors <- c(
  "Both AIS and SAR" = "purple",
  "Only AIS" = "blue",
  "Only SAR" = "red",
  "No fishing detected" = "white"
)

# Global comparison map of AIS vs SAR coverage
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data,
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = category_colors,
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Detection",
    subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)
Code
# Summarise how many grid cells fall into each coverage category and the
# share of the total each represents.
summary_stats <- combined_data %>%
  count(category) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Render the summary as a styled HTML table.
kable(summary_stats, format = "html",
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE, position = "center"
  ) %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
Summary Statistics of Data Categories
Category
Number of cells
Percentage (%)
Both AIS and SAR
163095
9.12
Only AIS
1566190
87.60
Only SAR
58668
3.28
Random forest model
Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution, using a random forest model trained on >160,000 associations between SAR vessel detections and AIS fishing hours globally, together with geographical coordinates, distance to ports, distance to shore, and bathymetry.
Code
# Open the saved adjusted environmental rasters (all on the 0.1-degree grid)
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack the rasters and flatten to a data frame of cell centers + values
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)
raster_df <- as.data.frame(raster_stack, xy = TRUE)
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")

# Drop cells with any missing layer value
raster_df <- na.omit(raster_df)

# data.table for fast keyed operations
setDT(raster_df)

# Round coordinates to 1 decimal so they align with the analysis grid
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# Keep only ocean cells (negative bathymetry)
raster_df <- raster_df[bathy < 0]

# Rounding can create duplicate coordinate pairs; keep the first of each
raster_df <- unique(raster_df, by = c("lon_std", "lat_std"))
setDT(raster_df)

# Load the combined AIS/SAR grid (provides `combined_data_01deg`).
# NOTE(review): the filename uses a letter O ("O1deg") while the object
# uses a zero ("01deg") -- presumably the file on disk really is named
# this way; confirm before "fixing".
load(here::here("data", "combined_data_O1deg.Rdata"))

# De-duplicate the combined grid on the same key
combined_data_01deg <- unique(combined_data_01deg, by = c("lon_std", "lat_std"))
setDT(combined_data_01deg)

# Inner join: raster covariates attached to cells that have AIS/SAR data
combined_data_with_rasters <- raster_df[combined_data_01deg,
                                        on = .(lon_std, lat_std),
                                        nomatch = 0]
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Full ocean grid with covariates, left-joined to the AIS/SAR flags
combined_data_all <- raster_df[, .(lon_std, lat_std, dist_shore, dist_ports, bathy)]
combined_data_all <- merge(combined_data_all, combined_data_01deg,
                           by = c("lon_std", "lat_std"), all.x = TRUE)

# Cells absent from the AIS/SAR grid have no data from either source
combined_data_all[is.na(has_AIS), has_AIS := FALSE]
combined_data_all[is.na(has_SAR), has_SAR := FALSE]

# Categorize each cell; AIS takes precedence over SAR
combined_data_all <- combined_data_all %>%
  mutate(category = case_when(
    has_AIS == TRUE ~ "AIS Data",
    has_SAR == TRUE ~ "SAR Data Only",
    TRUE ~ "No AIS or SAR Data"
  ))

# World map backdrop
world_map <- map_data("world")

# Shared palette for fill and (tile-border) color
coverage_colors <- c(
  "AIS Data" = "blue",
  "SAR Data Only" = "red",
  "No AIS or SAR Data" = "black"
)

# Global coverage map
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_all,
            aes(x = lon_std, y = lat_std, fill = category, color = category)) +
  scale_fill_manual(
    values = coverage_colors,
    name = "Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  scale_color_manual(values = coverage_colors, guide = "none") +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Data Coverage",
    subtitle = "Distribution of AIS and SAR data at 0.1-degree resolution",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

# Print the plot
print(world_plot)
Code
# Count cells per coverage type (AIS takes precedence over SAR, matching
# the categorization used for the map above).
summary_stats <- combined_data_all[, .(data_type = case_when(
  has_AIS == TRUE ~ "AIS Data",
  has_SAR == TRUE ~ "SAR Data Only",
  TRUE ~ "No AIS or SAR Data"
))][, .(
  num_cells = .N,
  percentage = .N / nrow(combined_data_all) * 100
), by = data_type]

# Fix the display order of the categories
summary_stats <- summary_stats[
  order(match(data_type, c("AIS Data", "SAR Data Only", "No AIS or SAR Data")))
]

# Append a total row
total_row <- data.table(
  data_type = "Total",
  num_cells = sum(summary_stats$num_cells),
  percentage = 100
)
summary_stats <- rbindlist(list(summary_stats, total_row))

# Styled HTML table, with the total row highlighted
kable_output <- summary_stats %>%
  kable(
    col.names = c("Data Type", "Number of Cells", "Percentage (%)"),
    digits = c(0, 0, 2),
    align = c("l", "r", "r"),
    caption = "Summary Statistics of Data Types"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(nrow(summary_stats), bold = TRUE, background = "#F0F0F0") %>%
  footnote(
    general = "AIS Data category includes cells with both AIS and SAR data.",
    general_title = "Note:",
    footnote_as_chunk = TRUE
  )

# Print the table
kable_output
Summary Statistics of Data Types
Data Type
Number of Cells
Percentage (%)
AIS Data
1240429
42.14
SAR Data Only
39915
1.36
No AIS or SAR Data
1663016
56.50
Total
2943360
100.00
Note: AIS Data category includes cells with both AIS and SAR data.
Code
# Training set for the random forest: grid cells where AIS fishing hours
# and SAR detections co-occur, with the raster covariates attached.
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score,
                lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()
Comparison of transformations in models
Code
# The four candidate models were trained once and saved to .rds files;
# the training code below is kept (commented out) for provenance.
#load(here::here("training_data.Rdata"))
#training_data_log <- training_data %>%
#  mutate(
#    log_total_presence_score = log10(total_presence_score + 1),
#    log_total_fishing_hours = log10(total_fishing_hours + 1)
#  )
## Function to run a single model
#run_model <- function(formula, data) {
#  randomForest(formula, data = data, ntree = 500, importance = TRUE)
#}
## Set up parallel processing (all but one core)
#num_cores <- detectCores() - 1
#cl <- makeCluster(num_cores)
#clusterExport(cl, c("training_data_log", "run_model"))
#clusterEvalQ(cl, library(randomForest))
## Define the models
#models <- list(
#  no_transform = as.formula(total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#  original = as.formula(total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#  log = as.formula(log_total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy)
#)
#results <- parLapply(cl, models, function(formula) run_model(formula, training_data_log))
#stopCluster(cl)
#rf_model_no_transform <- results[[1]]
#rf_model_original <- results[[2]]
#rf_model_log <- results[[3]]
#saveRDS(rf_model_no_transform, "rf_model_no_transform.rds")
#saveRDS(rf_model_original, "rf_model_original.rds")
#saveRDS(rf_model_log, "rf_model_log.rds")
## Additional model with only total_fishing_hours log-transformed
#rf_model_fishing_log <- randomForest(
#  log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#  data = training_data_log,
#  ntree = 500,
#  importance = TRUE
#)
#saveRDS(rf_model_fishing_log, "rf_model_fishing_log.rds")

# Load the saved models
rf_model_no_transform <- readRDS(here::here("rf_model_no_transform.rds"))
rf_model_original <- readRDS(here::here("rf_model_original.rds"))
rf_model_log <- readRDS(here::here("rf_model_log.rds"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Evaluate a fitted randomForest model against `data`.
#
# model      - a fitted randomForest object (trained with importance = TRUE)
# data       - data frame holding the predictors and total_fishing_hours
# log_target - TRUE when the model predicts log10(hours + 1); predictions
#              are then back-transformed before computing error metrics
#
# Returns a named list of error metrics, R-squared statistics, residual
# summaries, and the permutation feature-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics.
  # NOTE(review): MAPE divides by `actual`; if any observed value is 0 the
  # term is Inf, which na.rm does not remove -- confirm the validation set
  # excludes zero-hour cells.
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's "% Var explained", OOB-based)
  r_squared <- model$rsq[length(model$rsq)]

  # Permutation feature importance; one row per predictor, so it also
  # gives us the predictor count for the adjustment below.
  feature_importance <- importance(model)

  # Adjusted R-squared.
  # BUG FIX: the previous code read model$forest$independent.variable.names,
  # which is a ranger field and is NULL on randomForest objects, so p was
  # always 0 and the "adjusted" value simply equalled R-squared.
  n <- length(actual)
  p <- nrow(feature_importance)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Validation set: cells with both AIS and SAR coverage
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

# Evaluate all four models; the "presence log" models need the
# log-transformed predictor added first.
results_no_transform <- evaluate_model(rf_model_no_transform, validation_data)
validation_data_logpres <- validation_data %>%
  mutate(log_total_presence_score = log10(total_presence_score + 1))
results_original <- evaluate_model(rf_model_original, validation_data_logpres)
results_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data,
                                      log_target = TRUE)
results_log <- evaluate_model(rf_model_log, validation_data_logpres,
                              log_target = TRUE)

# Assemble the comparison table. The scalar metrics are pulled by name so
# the four columns cannot drift out of sync with each other (the previous
# version hand-copied 32 list extractions).
metric_names <- c(
  "Mean Absolute Error", "Root Mean Squared Error",
  "Mean Absolute Percentage Error", "Median Absolute Error",
  "R-Squared", "Adjusted R-Squared",
  "Mean of Residuals", "Standard Deviation of Residuals"
)
results_df <- data.frame(
  Metric = metric_names,
  No_Transform = unlist(results_no_transform[metric_names]),
  Fishing_Log = unlist(results_fishing_log[metric_names]),
  Presence_Log = unlist(results_original[metric_names]),
  Both_Log = unlist(results_log[metric_names]),
  row.names = NULL
)

# Create the kable
kable(results_df, format = "html", digits = 3,
      col.names = c("Metric", "No Transform", "Fishing Hours Log",
                    "Presence Score Log", "Both Log"),
      caption = "Model Performance Comparison") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  ) %>%
  add_header_above(c(" " = 1, "Models" = 4)) %>%
  column_spec(1, bold = TRUE)
Model Performance Comparison
Models
Metric
No Transform
Fishing Hours Log
Presence Score Log
Both Log
Mean Absolute Error
160.267
186.463
160.912
186.250
Root Mean Squared Error
737.537
1123.120
742.742
1119.788
Mean Absolute Percentage Error
1228.122
71.127
1242.678
71.200
Median Absolute Error
21.973
10.100
22.012
10.104
R-Squared
0.812
0.824
0.808
0.824
Adjusted R-Squared
0.812
0.824
0.808
0.824
Mean of Residuals
-5.937
150.586
-6.438
150.287
Standard Deviation of Residuals
737.516
1112.983
742.718
1109.662
Interpretation of model comparison metrics
Based on the provided performance metrics, I would choose the Fishing Hours Log-Transformed Model. Here’s the reasoning:
R-Squared and Adjusted R-Squared: The Fishing Hours Log model has the highest R-squared (0.824, tied with the Both Log model) and Adjusted R-squared values, indicating it explains the most variance in the data.
Mean Absolute Percentage Error (MAPE): This model has a significantly lower MAPE (71.13%) compared to the No Transform and Presence Log models (both over 1200%). This suggests much better relative accuracy. Median Absolute Error: It has the lowest median absolute error (10.10), which indicates good performance on typical cases.
Root Mean Squared Error (RMSE): While higher than the No Transform model, the difference isn’t as dramatic as the improvement in MAPE.
Mean Absolute Error (MAE): Although higher than No Transform and Presence Log models, this should be considered in context with other metrics.
The Both Log model performs very similarly to the Fishing Hours Log model, but the latter edges it out slightly in most metrics.
The No Transform and Presence Log models, despite having lower MAE and RMSE, have extremely high MAPE values, suggesting they might be making large relative errors, especially on smaller values.
The logarithmic transformation of fishing hours seems to have addressed some issues with the data distribution, leading to more balanced performance across different scales of the target variable.
In conclusion, the Fishing Hours Log-Transformed Model appears to offer the best overall performance, particularly in terms of explained variance and relative error metrics. However, the choice might depend on the specific requirements of your application, such as whether absolute or relative errors are more important in your context.
Selected Model performance
Code
# Evaluate a fitted randomForest model against `data`.
#
# This redefinition differs from the earlier evaluate_model in one way:
# when log_target = TRUE, `actual` is recovered by back-transforming the
# precomputed log_total_fishing_hours column (numerically equivalent to
# using total_fishing_hours directly).
#
# model      - a fitted randomForest object (trained with importance = TRUE)
# data       - data frame with the predictors, total_fishing_hours and,
#              when log_target = TRUE, log_total_fishing_hours
# log_target - TRUE when the model predicts log10(hours + 1)
#
# Returns a named list of error metrics, R-squared statistics, residual
# summaries, and the permutation feature-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's "% Var explained", OOB-based)
  r_squared <- model$rsq[length(model$rsq)]

  # Permutation feature importance; one row per predictor
  feature_importance <- importance(model)

  # Adjusted R-squared.
  # BUG FIX: model$forest$independent.variable.names is a ranger field and
  # is NULL on randomForest objects, so the old p was always 0 and the
  # "adjusted" value simply equalled R-squared.
  n <- length(actual)
  p <- nrow(feature_importance)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Validation set: cells with both AIS and SAR coverage, with the
# log-transformed target added for the selected model
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    ),
    log_total_fishing_hours = log10(total_fishing_hours + 1)
  )

# Evaluate the selected model (fishing hours log-transformed)
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")
results_rf_fishing_log <- evaluate_model(rf_model_fishing_log,
                                         validation_data,
                                         log_target = TRUE)

# Assemble the metric table; values pulled by name so the two columns
# cannot drift out of sync.
metric_names <- c(
  "Mean Absolute Error", "Root Mean Squared Error",
  "Mean Absolute Percentage Error", "Median Absolute Error",
  "R-Squared", "Adjusted R-Squared",
  "Mean of Residuals", "Standard Deviation of Residuals"
)
results_table <- data.frame(
  Metric = metric_names,
  Value = round(unlist(results_rf_fishing_log[metric_names]), 2),
  row.names = NULL
)

# Create and display the table
kable(results_table, format = "html", digits = 4,
      caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE, position = "center"
  )
Model Evaluation Metrics for Log-Transformed Fishing Hours Model
Metric
Value
Mean Absolute Error
186.46
Root Mean Squared Error
1123.12
Mean Absolute Percentage Error
71.13
Median Absolute Error
10.10
R-Squared
0.82
Adjusted R-Squared
0.82
Mean of Residuals
150.59
Standard Deviation of Residuals
1112.98
Code
# Build a tidy feature-importance table for the selected model.
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
# (The old colnames<- re-assigned the identical names; removed as redundant.)

# Sort by %IncMSE (permutation importance), most important first
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

# Drop the row names so kable does not render them as an extra column --
# the previous output showed every feature name twice (row name + Feature
# column).
rownames(feature_importance) <- NULL

kable(feature_importance, format = "html", digits = 4,
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE, position = "center"
  ) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance for Log-Transformed Fishing Hours Model
Feature
%IncMSE
IncNodePurity
lat_std
lat_std
298.3858
23760.26
lon_std
lon_std
243.3174
26855.30
bathy
bathy
214.4595
28714.10
total_presence_score
total_presence_score
189.2026
44184.39
dist_shore
dist_shore
136.4095
12774.27
dist_ports
dist_ports
99.1768
14170.55
Maps of predictions
Code
# Predictor set for every cell that has raster covariates
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy)

# Predict on the log scale, then back-transform to hours
log_predictions <- predict(rf_model_fishing_log, newdata = prediction_data)
predictions <- 10^log_predictions - 1

# Attach a single fishing-hours column: observed AIS hours where AIS
# exists, model predictions where only SAR exists, otherwise 0. The
# match() on pasted coordinates aligns predictions with the grid rows.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(
        paste(lon_std, lat_std),
        paste(prediction_data$lon_std, prediction_data$lat_std)
      )],
      TRUE ~ 0
    )
  )

# Violin plot comparing observed AIS hours with SAR-only predictions
plot_data <- combined_data_01deg %>%
  mutate(
    ais_fishing_hours = if_else(has_AIS, total_fishing_hours, NA_real_),
    sar_predicted_hours = if_else(!has_AIS & has_SAR,
                                  predicted_fishing_hours, NA_real_)
  ) %>%
  dplyr::select(ais_fishing_hours, sar_predicted_hours) %>%
  pivot_longer(
    cols = c(ais_fishing_hours, sar_predicted_hours),
    names_to = "type",
    values_to = "hours"
  ) %>%
  filter(!is.na(hours))

violin_plot <- ggplot(plot_data, aes(x = type, y = hours, fill = type)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white", color = "black",
               alpha = 0.5, outlier.shape = NA) +
  scale_y_log10(labels = scales::comma_format(accuracy = 1)) +
  scale_x_discrete(labels = c("ais_fishing_hours" = "AIS Data",
                              "sar_predicted_hours" = "SAR Predictions")) +
  labs(
    title = "Comparison of AIS Fishing Hours and SAR Predicted Fishing Hours",
    subtitle = "AIS data for AIS-covered areas, Predictions for SAR-only areas",
    x = "",
    y = "Fishing Hours (log scale)",
    fill = "Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(violin_plot)
Code
# Map of predicted fishing hours in SAR-only cells.
# RENAMED: the local data frame was called `map_data`, shadowing
# ggplot2::map_data(), which is called a few lines below. That only worked
# because R's function-call lookup skips non-function bindings -- fragile
# and confusing, so the data frame is now `sar_only_data`.
sar_only_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# The data already hold back-transformed predictions; no further transform.
#predicted_SAR_only_1RF=sar_only_data
#save(predicted_SAR_only_1RF, file="predicted_SAR_only_1RF.Rdata")

# World map backdrop
world_map <- map_data("world")

# Build a prediction map restricted to one lon/lat window.
#
# data       - data frame with a predicted_fishing_hours column
# world_map  - map_data("world") backdrop
# lon_col,
# lat_col    - names (strings) of the coordinate columns in `data`
# lon_range,
# lat_range  - c(min, max) window passed to coord_fixed
# title,
# subtitle   - plot labels
create_region_map <- function(data, world_map, lon_col, lat_col,
                              lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data,
              aes(x = .data[[lon_col]], y = .data[[lat_col]],
                  fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)",
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                             title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title, subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(
  sar_only_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Predicted Fishing Hours in Areas with Only SAR Detections",
  "0.1 degree resolution"
)
# Caribbean map
caribbean_map <- create_region_map(
  sar_only_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.1 degree resolution"
)
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(
  sar_only_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.1 degree resolution"
)
# Asia map
asia_map <- create_region_map(
  sar_only_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.1 degree resolution"
)

# Print the global map
print(predicted_SAR_only_plot)
Code
# Regional view: Caribbean (0.1-degree predictions, built above)
print(caribbean_map)
Code
# Regional view: Northern Indian Ocean to Eastern Atlantic (built above)
print(indian_european_map)
Code
# Regional view: Asia (built above)
print(asia_map)
Code
# Map of fishing hours combining observed AIS values with the random
# forest predictions for SAR-only cells (the predicted_fishing_hours
# column built earlier holds both).
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)
Code
#Aggregate data to 0.5 degree resolution # First, round the coordinates to the nearest 0.5 degreecombined_data_05deg <- combined_data_01deg %>%mutate(lon_05deg =round(lon_std *2) /2,lat_05deg =round(lat_std *2) /2 )# Aggregate the dataaggregated_data <- combined_data_05deg %>%group_by(lon_05deg, lat_05deg) %>%summarise(predicted_fishing_hours =sum(predicted_fishing_hours, na.rm =TRUE),.groups ='drop' )#save(aggregated_data, file=here::here("Predicted_Fishing_Hours_05Deg.Rdata"))# Create the mappredicted_plot_05deg <-ggplot() +geom_map(data = world_map, map = world_map,aes(long, lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = aggregated_data, aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Hours (0.5 degree resolution)",subtitle ="Based on AIS data and Random Forest predictions from SAR data",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )# Print the mapprint(predicted_plot_05deg)
Code
# Caribbean mapcaribbean_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-100, -50), c(0, 40), "Fishing Hours in the Caribbean", "0.5 degree resolution")# Northwestern Indian Ocean to Western European waters mapindian_european_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-20, 80), c(0, 70), "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution")# Asia mapasia_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(60, 180), c(-20, 60), "Fishing Hours in Asia", "0.5 degree resolution")# Print the mapsprint(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Test for spatial auto-correlation
Code
## ------------ Create a non-random test set ------------

# Prepare the data: keep cells that have both AIS and SAR observations
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit() %>%
  mutate(log_total_fishing_hours = log10(total_fishing_hours + 1))

# Split the data into training and test sets
set.seed(123)  # for reproducibility
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model (commented out; a saved model is loaded below)
# set.seed(123)  # for reproducibility
# rf_timing <- system.time({
#   rf_model_fishing_log_train <- randomForest(
#     log_total_fishing_hours ~ total_presence_score + lon_std + lat_std +
#       dist_shore + dist_ports + bathy,
#     data = train_data,
#     ntree = 500,
#     importance = TRUE
#   )
# })
# cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
# save(rf_model_fishing_log_train, file = "rf_model_fishing_log_train.Rdata")

# Load the saved model
# load(here::here("rf_model_fishing_log_train.Rdata"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))
# NOTE(review): this loaded model appears to have been fitted on the full
# dataset, so the "test set" below is not strictly out-of-sample — confirm
# whether rf_model_fishing_log_train should be used here instead.

# Make predictions on the test set
test_data$log_predictions <- predict(rf_model_fishing_log, test_data)

# Back-transform predictions and actual values (log10(x + 1) inverse)
test_data$predictions <- 10^test_data$log_predictions - 1
test_data$total_fishing_hours_original <- 10^test_data$log_total_fishing_hours - 1

# Calculate residuals on the original scale
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Build a neighbours list from the k nearest neighbours
k <- 8  # you can adjust this number
nb <- knn2nb(knearneigh(st_coordinates(test_data_sf), k = k))

# Row-standardised spatial weights
lw <- nb2listw(nb, style = "W")

# Perform Moran's I test on the test-set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Collect the test results into a data frame for display
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance",
                "Standard deviate", "P-value"),
  Value = c(
    round(moran_test$estimate[1], 2),
    round(moran_test$estimate[2], 2),
    round(moran_test$estimate[3], 2),
    round(moran_test$statistic, 2),
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
           format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)

# Create and display the table
kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
Moran I Test under randomisation
Statistic
Value
Moran I statistic
Moran I statistic
0.26
Expectation
Expectation
0
Variance
Variance
0
Moran I statistic standard deviate
Standard deviate
103.7
P-value
< 2.2e-16
Spatial cross-validation
Code
# --- Spatial cross-validation ---

# Load the training data and the model
load(here::here("training_data.Rdata"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Convert training_data to an sf object if it's not already
if (!inherits(training_data, "sf")) {
  training_data_sf <- st_as_sf(training_data,
                               coords = c("lon_std", "lat_std"), crs = 4326)
} else {
  training_data_sf <- training_data
}

# Partition the points into 6 spatial blocks via k-means on the coordinates
set.seed(123)  # for reproducibility
coords <- st_coordinates(training_data_sf)
kmeans_result <- kmeans(coords, centers = 6)

# Add cluster assignments to the sf object
training_data_sf$block <- as.factor(kmeans_result$cluster)

# Evaluate a random forest regression model on `data` and return a list of
# error metrics plus the model's feature importance.
#
# @param model      A fitted randomForest regression model.
# @param data       Data frame holding the model's predictors and the target.
# @param log_target If TRUE, predictions and the target are log10(x + 1)
#                   transformed and are back-transformed before scoring.
# @return Named list of scalar metrics and the importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours

  # Basic error metrics, computed on the original (hours) scale
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared from the model's final OOB estimate (matches randomForest's
  # "% Var explained"). NOTE(review): this is a property of the fitted model,
  # not of `data`, so it is identical for every fold evaluated below — the
  # rendered table shows 0.82 for all six folds for exactly this reason.
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # BUG FIX: `model$forest$independent.variable.names` is a ranger field that
  # does not exist on randomForest objects, so `p` was silently 0. The number
  # of predictors equals the number of rows of the importance matrix.
  n <- length(actual)
  p <- nrow(importance(model))  # number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for random forest)
  feature_importance <- importance(model)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# Predict for each fold (leave-one-block-out evaluation)
results_list <- list()
performance_metrics <- list()

for (i in 1:6) {
  # Get the test data for the i-th spatial block
  test_data <- training_data_sf[training_data_sf$block == i, ]

  # Extract coordinates (the sf geometry is dropped for prediction)
  coords <- st_coordinates(test_data)

  # Build a plain data.table with the columns the model expects
  test_data_df <- data.table(
    total_fishing_hours = test_data$total_fishing_hours,
    log_total_fishing_hours = log10(test_data$total_fishing_hours + 1),
    total_presence_score = test_data$total_presence_score,
    dist_shore = test_data$dist_shore,
    dist_ports = test_data$dist_ports,
    bathy = test_data$bathy,
    lon_std = coords[, "X"],
    lat_std = coords[, "Y"]
  )

  # Evaluate the model on this block and store the metrics
  fold_metrics <- evaluate_model(rf_model_fishing_log, test_data_df, log_target = TRUE)
  performance_metrics[[i]] <- fold_metrics

  # Make back-transformed predictions for results_list
  log_predictions <- predict(rf_model_fishing_log, newdata = test_data_df)
  predictions <- 10^log_predictions - 1

  # Store observed vs predicted with fold id and coordinates
  results_list[[i]] <- data.table(
    observed = test_data$total_fishing_hours,
    predicted = predictions,
    fold = i,
    lon_std = coords[, "X"],
    lat_std = coords[, "Y"]
  )
}

# Assemble the per-fold performance table (one row per scalar metric)
performance_table <- data.frame(
  Metric = character(),
  Fold1 = numeric(), Fold2 = numeric(), Fold3 = numeric(),
  Fold4 = numeric(), Fold5 = numeric(), Fold6 = numeric(),
  stringsAsFactors = FALSE
)

for (metric in names(performance_metrics[[1]])) {
  if (metric != "Feature Importance") {
    row <- c(metric, sapply(performance_metrics, function(x) x[[metric]]))
    performance_table <- rbind(performance_table, row)
  }
}

# rbind of character rows coerced everything to character: restore numerics
performance_table[, 2:7] <- lapply(performance_table[, 2:7], as.numeric)

performance_metrics_names <- c(
  "Mean Absolute Error",
  "Root Mean Squared Error",
  "Mean Absolute Percentage Error",
  "Median Absolute Error",
  "R-Squared",
  "Adjusted R-Squared",
  "Mean of Residuals",
  "Standard Deviation of Residuals"
)
performance_table[, 1] <- performance_metrics_names
colnames(performance_table) <- c("Metric", "Fold1", "Fold2", "Fold3",
                                 "Fold4", "Fold5", "Fold6")

# Round the values for display
performance_table_formatted <- performance_table %>%
  mutate(across(Fold1:Fold6, ~ round(., 2)))

# Display the performance table
kable(performance_table_formatted,
      caption = "Model Performance for Each Fold",
      align = c('l', rep('r', 6)),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE)
Model Performance for Each Fold
Metric
Fold1
Fold2
Fold3
Fold4
Fold5
Fold6
Mean Absolute Error
677.18
110.26
41.84
44.39
158.75
95.34
Root Mean Squared Error
2991.82
557.18
196.19
300.53
675.18
393.05
Mean Absolute Percentage Error
75.78
75.41
80.52
65.87
61.37
86.08
Median Absolute Error
26.05
5.98
5.66
5.12
11.84
9.65
R-Squared
0.82
0.82
0.82
0.82
0.82
0.82
Adjusted R-Squared
0.82
0.82
0.82
0.82
0.82
0.82
Mean of Residuals
548.04
100.93
36.77
37.30
129.92
80.80
Standard Deviation of Residuals
2941.26
547.99
192.72
298.21
662.57
384.67
Predict fishing hours in areas with no fishing data
Random forest model
Code
# Use combined_data_01deg which has the predicted_fishing_hours columntraining_data <- combined_data_01deg %>%left_join(raster_df, by =c("lon_std", "lat_std")) %>% dplyr::select(predicted_fishing_hours, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%na.omit()# Log-transform the fishing hours (add a small constant to avoid log(0))training_data$log_fishing_hours <-log(training_data$predicted_fishing_hours +1)library(dplyr)# Function to round to nearest 0.5round_to_half <-function(x) {round(x *2) /2}# Aggregate the data to 0.5 degree resolutiontraining_data_05deg <- training_data %>%mutate(lon_05deg =round_to_half(lon_std),lat_05deg =round_to_half(lat_std) ) %>%group_by(lon_05deg, lat_05deg) %>%summarise(predicted_fishing_hours =sum(predicted_fishing_hours, na.rm =TRUE),dist_shore =mean(dist_shore, na.rm =TRUE),dist_ports =mean(dist_ports, na.rm =TRUE),bathy =mean(bathy, na.rm =TRUE),.groups ='drop' ) %>%mutate(log_fishing_hours =log(predicted_fishing_hours +1) )# Rename columns to match original naming conventiontraining_data_05deg <- training_data_05deg %>%rename(lon_std = lon_05deg, lat_std = lat_05deg)#Split the data into training and test setsset.seed(123) # for reproducibilitysample_index <-sample(1:nrow(training_data_05deg), 0.8*nrow(training_data_05deg))train_data <- training_data_05deg[sample_index, ]test_data <- training_data_05deg[-sample_index, ]#Train the random forest model#library(randomForest)#rf_model_env <- randomForest(# log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy,# data = train_data,# ntree = 500,# importance = TRUE#)# Now, save the model as an RDS file#saveRDS(rf_model_env, file = "rf_model_env_global_nodata.rds")rf_model_env <-readRDS("rf_model_env_global_nodata.rds")
Model performance
Code
# Evaluate a random forest regression model on `data` and return a list of
# error metrics plus the model's feature importance.
#
# @param model      A fitted randomForest regression model.
# @param data       Data frame holding the model's predictors and the target.
# @param log_target If TRUE, predictions and `log_column` are log(x + 1)
#                   (natural log) transformed and are back-transformed first.
# @param log_column Name of the log-transformed target column in `data`.
# @return Named list of scalar metrics and the importance matrix.
evaluate_model <- function(model, data, log_target = FALSE,
                           log_column = "log_fishing_hours") {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- exp(predictions) - 1
  }
  # NOTE(review): the log_target = FALSE branch reads a `fishing_hours`
  # column that the data sets in this file do not appear to carry — confirm
  # before calling with log_target = FALSE.
  actual <- if (log_target) exp(data[[log_column]]) - 1 else data$fishing_hours

  # Basic error metrics on the original (hours) scale
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared from the model's final OOB estimate (matches randomForest's
  # "% Var explained"); a property of the fitted model, not of `data`.
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # BUG FIX: `model$forest$independent.variable.names` is a ranger field that
  # does not exist on randomForest objects, so `p` was silently 0. The number
  # of predictors equals the number of rows of the importance matrix.
  n <- length(actual)
  p <- nrow(importance(model))  # number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for random forest)
  feature_importance <- importance(model)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# Evaluate the model on the test set
results_rf_fishing_log <- evaluate_model(rf_model_env, test_data, log_target = TRUE)

# Create a dataframe for the metrics table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error",
             "Mean Absolute Percentage Error", "Median Absolute Error",
             "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_fishing_log$`Mean Absolute Error`,
                  results_rf_fishing_log$`Root Mean Squared Error`,
                  results_rf_fishing_log$`Mean Absolute Percentage Error`,
                  results_rf_fishing_log$`Median Absolute Error`,
                  results_rf_fishing_log$`R-Squared`,
                  results_rf_fishing_log$`Adjusted R-Squared`,
                  results_rf_fishing_log$`Mean of Residuals`,
                  results_rf_fishing_log$`Standard Deviation of Residuals`),
                2)  # round to 2 decimal places
)

# Format the numbers in the results_table
results_table$Value <- sapply(results_table$Value, function(x) {
  if (abs(x) < 0.01 || abs(x) >= 1e6) {
    format(x, scientific = FALSE, big.mark = ",")
  } else {
    format(round(x, 2), nsmall = 2, big.mark = ",")
  }
})

# Create and display the metrics table
kable(results_table, format = "html",
      caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model",
      escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics for Log-Transformed Fishing Hours Model
Metric
Value
Mean Absolute Error
729.30
Root Mean Squared Error
9,292.07
Mean Absolute Percentage Error
163.96
Median Absolute Error
31.42
R-Squared
0.76
Adjusted R-Squared
0.76
Mean of Residuals
650.98
Standard Deviation of Residuals
9,269.48
Code
# For feature importance, create a separate tablefeature_importance <-as.data.frame(results_rf_fishing_log$`Feature Importance`)feature_importance$Feature <-rownames(feature_importance)feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]colnames(feature_importance) <-c("Feature", "%IncMSE", "IncNodePurity")# Sort the feature importance table by %IncMSE in descending orderfeature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]# Create and display the feature importance tablekable(feature_importance, format ="html", digits =2, col.names =c("Feature", "%IncMSE", "IncNodePurity"),caption ="Feature Importance for Log-Transformed Fishing Hours Model") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(1, bold =TRUE) %>%column_spec(2:3, width ="150px")
Feature Importance for Log-Transformed Fishing Hours Model
Feature
%IncMSE
IncNodePurity
lat_std
lat_std
196.88
96300.61
lon_std
lon_std
174.38
79182.68
dist_shore
dist_shore
156.22
45441.73
dist_ports
dist_ports
138.86
54425.31
bathy
bathy
119.70
49331.27
Maps of predictions
Code
# Aggregate data to 0.5 degree resolutionprediction_data_env <- raster_df %>% dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%filter(dist_shore >0) %>%# Only include ocean areasmutate(lon_std =floor(lon_std *2) /2, # Round to nearest 0.5lat_std =floor(lat_std *2) /2# Round to nearest 0.5 ) %>%group_by(lon_std, lat_std) %>%summarise(dist_shore =mean(dist_shore, na.rm =TRUE),dist_ports =mean(dist_ports, na.rm =TRUE),bathy =mean(bathy, na.rm =TRUE) ) %>%na.omit()# Load the random forest modelrf_model_env <-readRDS("rf_model_env_global_nodata.rds")# Make predictions using the modellog_predictions_env <-predict(rf_model_env, newdata = prediction_data_env)# Back-transform predictionspredictions_env <-exp(log_predictions_env) -1# Add predictions to the aggregated dataframeprediction_data_env$predicted_fishing_hours <- predictions_env# Global mapglobal_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(-180, 180),lat_range =c(-90, 90),title ="Global Fishing Hours",subtitle ="0.5 degree resolution, based on AIS data, SAR data and Random Forest predictions using environmental data")# Regional mapscaribbean_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(-100, -50), c(0, 40), "Predicted Fishing Hours in the Caribbean", "0.5 degree resolution")indian_european_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(-20, 80), c(0, 70), "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution")asia_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(60, 180), c(-20, 60), "Predicted Fishing Hours in Asia", "0.5 degree resolution")# Print regional mapsprint(global_map)
Code
print(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Add MPA location as a predictor
Code
# Extract MPA data for each point# Load the MPA binary rastermpa_raster <-raster(here::here("Data/mpa_ALL_binary.tif"))mpa_data <- raster::extract(mpa_raster, training_data_05deg[, c("lon_std", "lat_std")])# Add MPA data to the training datasettraining_data_05deg$mpa_present <- mpa_data# Split the data into training and test setsset.seed(123) # for reproducibilitysample_index <-sample(1:nrow(training_data_05deg), 0.8*nrow(training_data_05deg))train_data <- training_data_05deg[sample_index, ]test_data <- training_data_05deg[-sample_index, ]# Train the random forest model including MPA as a predictor#rf_model_env_mpa <- randomForest(# log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy + mpa_present,# data = train_data,# ntree = 500,# importance = TRUE#)# Save the new model#saveRDS(rf_model_env_mpa, file = here::here("rf_model_env_mpa_global.rds"))
Model performance
Code
# Evaluate a random forest regression model on `data` and return a list of
# error metrics plus the model's feature importance.
#
# @param model      A fitted randomForest regression model.
# @param data       Data frame holding the model's predictors and the target.
# @param log_target If TRUE, predictions and `log_column` are log(x + 1)
#                   (natural log) transformed and are back-transformed first.
# @param log_column Name of the log-transformed target column in `data`.
# @return Named list of scalar metrics and the importance matrix.
evaluate_model <- function(model, data, log_target = FALSE,
                           log_column = "log_fishing_hours") {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- exp(predictions) - 1
  }
  # NOTE(review): the log_target = FALSE branch reads a `fishing_hours`
  # column that the data sets in this file do not appear to carry — confirm
  # before calling with log_target = FALSE.
  actual <- if (log_target) exp(data[[log_column]]) - 1 else data$fishing_hours

  # Basic error metrics on the original (hours) scale
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared from the model's final OOB estimate (matches randomForest's
  # "% Var explained"); a property of the fitted model, not of `data`.
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # BUG FIX: `model$forest$independent.variable.names` is a ranger field that
  # does not exist on randomForest objects, so `p` was silently 0. The number
  # of predictors equals the number of rows of the importance matrix.
  n <- length(actual)
  p <- nrow(importance(model))  # number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for random forest)
  feature_importance <- importance(model)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# Load the MPA model and evaluate it on the test set
rf_model_env_mpa <- readRDS(here::here("rf_model_env_mpa_global.rds"))
results_rf_fishing_log <- evaluate_model(rf_model_env_mpa, test_data, log_target = TRUE)

# Create a dataframe for the metrics table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error",
             "Mean Absolute Percentage Error", "Median Absolute Error",
             "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_fishing_log$`Mean Absolute Error`,
                  results_rf_fishing_log$`Root Mean Squared Error`,
                  results_rf_fishing_log$`Mean Absolute Percentage Error`,
                  results_rf_fishing_log$`Median Absolute Error`,
                  results_rf_fishing_log$`R-Squared`,
                  results_rf_fishing_log$`Adjusted R-Squared`,
                  results_rf_fishing_log$`Mean of Residuals`,
                  results_rf_fishing_log$`Standard Deviation of Residuals`),
                2)  # round to 2 decimal places
)

# Format the numbers in the results_table
results_table$Value <- sapply(results_table$Value, function(x) {
  if (abs(x) < 0.01 || abs(x) >= 1e6) {
    format(x, scientific = FALSE, big.mark = ",")
  } else {
    format(round(x, 2), nsmall = 2, big.mark = ",")
  }
})

# Create and display the metrics table
kable(results_table, format = "html",
      caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model with MPA location",
      escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics for Log-Transformed Fishing Hours Model with MPA location
Metric
Value
Mean Absolute Error
618.03
Root Mean Squared Error
7,980.45
Mean Absolute Percentage Error
135.49
Median Absolute Error
26.86
R-Squared
0.80
Adjusted R-Squared
0.80
Mean of Residuals
515.09
Standard Deviation of Residuals
7,964.01
Code
# For feature importance, create a separate tablefeature_importance <-as.data.frame(results_rf_fishing_log$`Feature Importance`)feature_importance$Feature <-rownames(feature_importance)feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]colnames(feature_importance) <-c("Feature", "%IncMSE", "IncNodePurity")# Sort the feature importance table by %IncMSE in descending orderfeature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]# Create and display the feature importance tablekable(feature_importance, format ="html", digits =2, col.names =c("Feature", "%IncMSE", "IncNodePurity"),caption ="Feature Importance for Log-Transformed Fishing Hours Model with MPA location") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(1, bold =TRUE) %>%column_spec(2:3, width ="150px")
Feature Importance for Log-Transformed Fishing Hours Model with MPA location
Feature
%IncMSE
IncNodePurity
lat_std
lat_std
339.29
104313.10
lon_std
lon_std
317.89
84089.63
dist_shore
dist_shore
164.96
41929.62
dist_ports
dist_ports
141.52
51094.03
bathy
bathy
121.45
46684.34
mpa_present
mpa_present
51.95
3389.27
Maps of predictions
Code
# --- Maps of predictions using the MPA-aware model ---

# Load the MPA binary raster
mpa_raster <- raster(here::here("Data/mpa_ALL_binary.tif"))

# Aggregate the environmental raster data to 0.5 degree resolution.
# BUG FIX: the original used floor(x * 2) / 2 despite its "round to nearest
# 0.5" comment; the model's training grid (round_to_half) rounds to the
# NEAREST half degree, so floor() introduced a systematic half-cell offset
# between training and prediction coordinates. Use round() to match.
prediction_data_env <- raster_df %>%
  dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  filter(dist_shore > 0) %>%  # only include ocean areas
  mutate(
    lon_std = round(lon_std * 2) / 2,  # round to nearest 0.5
    lat_std = round(lat_std * 2) / 2   # round to nearest 0.5
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(
    dist_shore = mean(dist_shore, na.rm = TRUE),
    dist_ports = mean(dist_ports, na.rm = TRUE),
    bathy = mean(bathy, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  na.omit()

# Extract MPA presence for each point in the prediction grid
mpa_data <- raster::extract(mpa_raster, prediction_data_env[, c("lon_std", "lat_std")])
prediction_data_env$mpa_present <- mpa_data

# Make sure there are no NA values in the mpa_present column
# (NA predictors would make randomForest's predict() fail)
prediction_data_env <- prediction_data_env %>%
  mutate(mpa_present = ifelse(is.na(mpa_present), 0, mpa_present))

# Load the random forest model that includes MPA as a predictor
rf_model_env_mpa <- readRDS("rf_model_env_mpa_global.rds")

# Predict (natural-log scale), then back-transform
log_predictions_env_mpa <- predict(rf_model_env_mpa, newdata = prediction_data_env)
predictions_env_mpa <- exp(log_predictions_env_mpa) - 1

# Add predictions to the aggregated dataframe and persist them
prediction_data_env$predicted_fishing_hours <- predictions_env_mpa
save(prediction_data_env, file = here::here("Data", "predicted_fishing_everywhere_05Deg.Rdata"))

# Global map
global_map <- create_region_map(
  data = prediction_data_env, world_map = world_map,
  lon_col = "lon_std", lat_col = "lat_std",
  lon_range = c(-180, 180), lat_range = c(-90, 90),
  title = "Global Fishing Hours",
  subtitle = "0.5 degree resolution, based on AIS data, SAR data, MPA presence, and Random Forest predictions using environmental data"
)

# Regional maps
caribbean_map <- create_region_map(
  data = prediction_data_env, world_map = world_map,
  lon_col = "lon_std", lat_col = "lat_std",
  lon_range = c(-100, -50), lat_range = c(0, 40),
  title = "Predicted Fishing Hours in the Caribbean",
  subtitle = "0.5 degree resolution"
)
indian_european_map <- create_region_map(
  data = prediction_data_env, world_map = world_map,
  lon_col = "lon_std", lat_col = "lat_std",
  lon_range = c(-20, 80), lat_range = c(0, 70),
  title = "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  subtitle = "0.5 degree resolution"
)
asia_map <- create_region_map(
  data = prediction_data_env, world_map = world_map,
  lon_col = "lon_std", lat_col = "lat_std",
  lon_range = c(60, 180), lat_range = c(-20, 60),
  title = "Predicted Fishing Hours in Asia",
  subtitle = "0.5 degree resolution"
)

# Print the global map (regional maps are printed in the following chunks)
print(global_map)
Code
print(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Source Code
---title: "Global_fishing_watch_workflow_predict_everywhere"author: "Théophile L. Mouton"date: "`r Sys.Date()`"format: html: toc: true toc-location: right css: custom.css output-file: "Global_Fishing_Watch_workflow_predict_everywhere.html" self-contained: true code-fold: true code-tools: trueeditor: visualexecute: warning: falseparams: printlabels: TRUE---# Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch## Open R libraries```{r}# Load required librarieslibrary(tidyverse)library(tidyr)library(ggplot2)library(data.table)library(gridExtra)library(maps)library(raster)library(sf)library(viridis)library(scales)library(dplyr)library(randomForest)library(caret)library(pdp)library(knitr)library(kableExtra)library(future)library(spdep)library(ncf)library(blockCV)library(parallel)library(doParallel)library(sp)```## AIS dataData from Kroodsma et al. (2018) Science, accessible at: [Global Fishing Watch Data Download Portal](https://globalfishingwatch.org/data-download/datasets/public-fishing-effort).The data is accessible up to the end of the year 2020, we used four entire years of data (2017, 2018, 2019, 2020) to match SAR data records.```{r}# Set the path to the 2017-2020 folder#path <- "Data/AIS Fishing Effort 2017-2020"# List all CSV files in the folder#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)# Read all CSV files and combine them into a single data frame#AIS_fishing <- AIS_csv_files %>%# map_df(~read_csv(.))load(here::here("Data","AIS_fishing.Rdata"))# Aggregate fishing hours by latitude and longitudeaggregated_AIS_fishing <- AIS_fishing %>%group_by(cell_ll_lat, cell_ll_lon) %>%summarise(total_fishing_hours =sum(fishing_hours, na.rm =TRUE)) %>%ungroup() %>%# Remove any cells with zero or negative fishing hoursfilter(total_fishing_hours >0)# Function to standardize coordinates to 0.1 degree resolutionstandardize_coords <-function(lon, lat) {list(lon_std =floor(lon *10) 
/10,lat_std =floor(lat *10) /10 )}# Standardize and aggregate AIS dataAIS_data_std <- aggregated_AIS_fishing %>%mutate(coords =map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%mutate(lon_std =map_dbl(coords, ~ .x$lon_std),lat_std =map_dbl(coords, ~ .x$lat_std) ) %>%group_by(lon_std, lat_std) %>%summarise(total_fishing_hours =sum(total_fishing_hours, na.rm =TRUE), .groups ="drop")# Create the world mapworld_map <-map_data("world")# Create the plotggplot() +geom_map(data = world_map, map = world_map,aes(long = long, lat = lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = AIS_data_std, aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title =NULL,x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )```## Sentinel-1 SAR dataData from Paolo et al. 
2024 Nature, accessible at: [Global Fishing Watch SAR Vessel Detections](https://globalfishingwatch.org/data-download/datasets/public-sar-vessel-detections:v20231026)

The data is accessible from 2017 to today; we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.

```{r}
# Set the path to the SAR detections folder (2017-2020 data)
# path <- "Data/SAR Vessel detections 2017-2020"
# List all CSV files in the folder
# SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
# Read all CSV files and combine them into a single data frame
# SAR_fishing <- SAR_csv_files %>%
#   map_df(~read_csv(.))
load(here::here("Data", "SAR_fishing.Rdata"))

# Aggregate detections by latitude and longitude (0.01-degree cells),
# keeping only detections classified as fishing with high confidence.
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  filter(fishing_score >= 0.9) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize coordinates (standardize_coords is defined earlier in this
# document) and re-aggregate SAR detections on the standardized grid.
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(data = SAR_data_std, aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
```

## Compare AIS and SAR detection locations

Identifying grid cells with only AIS, only SAR detections, or both data types.

```{r}
# Merge the datasets on the standardized grid
combined_data <- full_join(
  AIS_data_std, SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell by which data source(s) detected fishing there.
# NA scores arise from the full join when a cell exists in only one dataset.
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(data = combined_data, aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c(
      "Both AIS and SAR" = "purple",
      "Only AIS" = "blue",
      "Only SAR" = "red",
      "No fishing detected" = "white"
    ),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Detection",
    subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

# Get summary statistics: number and share of cells per category
summary_stats <- combined_data %>%
  count(category) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Create and display the table
kable(summary_stats, format = "html",
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
```

## Random forest model

Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of \>160,000 associations between SAR vessel detections and AIS fishing hours globally, geographical coordinates, distance to ports, distance to shore and bathymetry.

```{r}
# Open the saved adjusted rasters
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack the resampled rasters
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# Convert the stack to a dataframe (xy = TRUE keeps cell coordinates)
raster_df <- as.data.frame(raster_stack, xy = TRUE)

# Rename the columns (order matches the stack order above)
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")

# Remove NA values
raster_df <- na.omit(raster_df)

# Convert to data.table for efficiency
setDT(raster_df)

# Round x and y to 1 decimal place for consistency with the 0.1-degree grid
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# Keep only ocean areas (negative bathymetry values)
raster_df <- raster_df[bathy < 0]

# Keep the first occurrence of each coordinate pair (because there are duplicates)
raster_df <- unique(raster_df, by = c("lon_std", "lat_std"))
setDT(raster_df)

# Now proceed with the join and model training
load(here::here("data", "combined_data_O1deg.Rdata"))

# Keep the first occurrence of each coordinate pair (because there are duplicates)
combined_data_01deg <- unique(combined_data_01deg, by = c("lon_std", "lat_std"))
setDT(combined_data_01deg)

# Perform the join using data.table (inner join: nomatch = 0 drops cells
# without raster covariates)
combined_data_with_rasters <- raster_df[combined_data_01deg, on = .(lon_std, lat_std), nomatch = 0]

# Convert back to dataframe if needed
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Create the new combined dataframe covering every ocean cell
combined_data_all <- raster_df[, .(lon_std, lat_std, dist_shore, dist_ports, bathy)]

# Perform the join (left join keeps all ocean cells)
combined_data_all <- merge(combined_data_all, combined_data_01deg,
                           by = c("lon_std", "lat_std"), all.x = TRUE)

# Fill NA values for has_AIS and has_SAR with FALSE
combined_data_all[is.na(has_AIS), has_AIS := FALSE]
combined_data_all[is.na(has_SAR), has_SAR := FALSE]

# Categorize each cell (AIS takes precedence over SAR)
combined_data_all <- combined_data_all %>%
  mutate(category = case_when(
    has_AIS == TRUE ~ "AIS Data",
    has_SAR == TRUE ~ "SAR Data Only",
    TRUE ~ "No AIS or SAR Data"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(data = combined_data_all,
            aes(x = lon_std, y = lat_std, fill = category, color = category)) +
  scale_fill_manual(
    values = c("AIS Data" = "blue", "SAR Data Only" = "red", "No AIS or SAR Data" = "black"),
    name = "Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  scale_color_manual(
    values = c("AIS Data" = "blue", "SAR Data Only" = "red", "No AIS or SAR Data" = "black"),
    guide = "none"
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Data Coverage",
    subtitle = "Distribution of AIS and SAR data at 0.1-degree resolution",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

# Print the plot
print(world_plot)

# Calculate summary statistics per data type
summary_stats <- combined_data_all[, .(data_type = case_when(
  has_AIS == TRUE ~ "AIS Data",
  has_SAR == TRUE ~ "SAR Data Only",
  TRUE ~ "No AIS or SAR Data"
))][, .(
  num_cells = .N,
  percentage = .N / nrow(combined_data_all) * 100
), by = data_type]

# Order the data types
summary_stats <- summary_stats[order(match(data_type, c("AIS Data", "SAR Data Only", "No AIS or SAR Data")))]

# Add total row
total_row <- data.table(
  data_type = "Total",
  num_cells = sum(summary_stats$num_cells),
  percentage = 100
)
summary_stats <- rbindlist(list(summary_stats, total_row))

# Create kable
kable_output <- summary_stats %>%
  kable(
    col.names = c("Data Type", "Number of Cells", "Percentage (%)"),
    digits = c(0, 0, 2),
    align = c("l", "r", "r"),
    caption = "Summary Statistics of Data Types"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  row_spec(nrow(summary_stats), bold = TRUE, background = "#F0F0F0") %>%
  footnote(
    general = "AIS Data category includes cells with both AIS and SAR data.",
    general_title = "Note:",
    footnote_as_chunk = TRUE
  )

# Print the kable
kable_output

# Prepare the training data: cells where both AIS and SAR observed fishing
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()
```

### Comparison of transformations in models

```{r}
# Prepare the data
# load(here::here("training_data.Rdata"))
# training_data_log <- training_data %>%
#   mutate(
#     log_total_presence_score = log10(total_presence_score + 1),
#     log_total_fishing_hours = log10(total_fishing_hours + 1)
#   )

# Function to run a single model
# run_model <- function(formula, data) {
#   randomForest(
#     formula,
#     data = data,
#     ntree = 500,
#     importance = TRUE
#   )
# }

# Set up parallel processing
# num_cores <- detectCores() - 1 # Use all but one core
# cl <- makeCluster(num_cores)
# Export necessary objects to the cluster
# clusterExport(cl, c("training_data_log", "run_model"))
# Load required packages on each cluster
# clusterEvalQ(cl, library(randomForest))

# Define the models
# models <- list(
#   no_transform = as.formula(total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#   original = as.formula(total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#   log = as.formula(log_total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy)
# )

# Run models in parallel
# results <- parLapply(cl, models, function(formula) run_model(formula, training_data_log))
# Stop the cluster
# stopCluster(cl)

# Save the models
# rf_model_no_transform <- results[[1]]
# rf_model_original <- results[[2]]
# rf_model_log <- results[[3]]
# saveRDS(rf_model_no_transform, "rf_model_no_transform.rds")
# saveRDS(rf_model_original, "rf_model_original.rds")
# saveRDS(rf_model_log, "rf_model_log.rds")

# Add the new model with only total_fishing_hours log-transformed
# rf_model_fishing_log <- randomForest(
#   log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#   data = training_data_log,
#   ntree = 500,
#   importance = TRUE
# )
# saveRDS(rf_model_fishing_log, "rf_model_fishing_log.rds")

# Load the saved models
rf_model_no_transform <- readRDS(here::here("rf_model_no_transform.rds"))
rf_model_original <- readRDS(here::here("rf_model_original.rds"))
rf_model_log <- readRDS(here::here("rf_model_log.rds"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Evaluate a randomForest model against observed fishing hours.
# log_target = TRUE back-transforms predictions made on the log10(x + 1) scale.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # BUGFIX: the original used model$forest$independent.variable.names, which is
  # a ranger slot; on a randomForest object it is NULL, so p was 0 and the
  # adjustment was a no-op. The importance matrix has one row per predictor.
  n <- length(actual)
  p <- nrow(model$importance) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for Random Forest)
  feature_importance <- importance(model)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Evaluate all models on cells with both AIS and SAR data
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

results_no_transform <- evaluate_model(rf_model_no_transform, validation_data)

# Models trained on the log-transformed presence score need that column too
validation_data_logpres <- validation_data %>%
  mutate(log_total_presence_score = log10(total_presence_score + 1))
results_original <- evaluate_model(rf_model_original, validation_data_logpres)

# Evaluation for the model with fishing hours log-transformed
results_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data, log_target = TRUE)
results_log <- evaluate_model(rf_model_log, validation_data_logpres, log_target = TRUE)

# Create a data frame with the results
results_df <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error",
             "Mean Absolute Percentage Error", "Median Absolute Error",
             "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  No_Transform = c(results_no_transform$`Mean Absolute Error`,
                   results_no_transform$`Root Mean Squared Error`,
                   results_no_transform$`Mean Absolute Percentage Error`,
                   results_no_transform$`Median Absolute Error`,
                   results_no_transform$`R-Squared`,
                   results_no_transform$`Adjusted R-Squared`,
                   results_no_transform$`Mean of Residuals`,
                   results_no_transform$`Standard Deviation of Residuals`),
  Fishing_Log = c(results_fishing_log$`Mean Absolute Error`,
                  results_fishing_log$`Root Mean Squared Error`,
                  results_fishing_log$`Mean Absolute Percentage Error`,
                  results_fishing_log$`Median Absolute Error`,
                  results_fishing_log$`R-Squared`,
                  results_fishing_log$`Adjusted R-Squared`,
                  results_fishing_log$`Mean of Residuals`,
                  results_fishing_log$`Standard Deviation of Residuals`),
  Presence_Log = c(results_original$`Mean Absolute Error`,
                   results_original$`Root Mean Squared Error`,
                   results_original$`Mean Absolute Percentage Error`,
                   results_original$`Median Absolute Error`,
                   results_original$`R-Squared`,
                   results_original$`Adjusted R-Squared`,
                   results_original$`Mean of Residuals`,
                   results_original$`Standard Deviation of Residuals`),
  Both_Log = c(results_log$`Mean Absolute Error`,
               results_log$`Root Mean Squared Error`,
               results_log$`Mean Absolute Percentage Error`,
               results_log$`Median Absolute Error`,
               results_log$`R-Squared`,
               results_log$`Adjusted R-Squared`,
               results_log$`Mean of Residuals`,
               results_log$`Standard Deviation of Residuals`)
)

# Create the kable
kable(results_df, format = "html", digits = 3,
      col.names = c("Metric", "No Transform", "Fishing Hours Log", "Presence Score Log", "Both Log"),
      caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Models" = 4)) %>%
  column_spec(1, bold = TRUE)
```

#### Interpretation of model comparison metrics

Based on the provided performance metrics, I would choose the Fishing Hours Log-Transformed Model. Here's the reasoning:

1. R-Squared and Adjusted R-Squared: The Fishing Hours Log model has the highest R-squared (0.8239) and Adjusted R-squared values, indicating it explains the most variance in the data.
2. Mean Absolute Percentage Error (MAPE): This model has a significantly lower MAPE (69.71%) compared to the No Transform and Presence Log models (both over 1200%). This suggests much better relative accuracy. Median Absolute Error: It has the lowest median absolute error (10.19), which indicates good performance on typical cases.
3. Root Mean Squared Error (RMSE): While higher than the No Transform model, the difference isn't as dramatic as the improvement in MAPE.
4. Mean Absolute Error (MAE): Although higher than the No Transform and Presence Log models, this should be considered in context with other metrics.

The Both Log model performs very similarly to the Fishing Hours Log model, but the latter edges it out slightly in most metrics. The No Transform and Presence Log models, despite having lower MAE and RMSE, have extremely high MAPE values, suggesting they might be making large relative errors, especially on smaller values. The logarithmic transformation of fishing hours seems to have addressed some issues with the data distribution, leading to more balanced performance across different scales of the target variable.

In conclusion, the Fishing Hours Log-Transformed Model appears to offer the best overall performance, particularly in terms of explained variance and relative error metrics. However, the choice might depend on the specific requirements of your application, such as whether absolute or relative errors are more important in your context.

### Selected Model performance

```{r}
# Same evaluation helper, but here the actual values can also be recovered
# from a log-transformed column when log_target = TRUE.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared (see BUGFIX note above: use the importance matrix to
  # count predictors; the ranger-style slot is NULL on randomForest objects)
  n <- length(actual)
  p <- nrow(model$importance) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for Random Forest)
  feature_importance <- importance(model)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    ),
    log_total_fishing_hours = log10(total_fishing_hours + 1)
  )

# Evaluate the model
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")
results_rf_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data, log_target = TRUE)

# Create a dataframe for the table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
             "Median Absolute Error", "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_fishing_log$`Mean Absolute Error`,
                  results_rf_fishing_log$`Root Mean Squared Error`,
                  results_rf_fishing_log$`Mean Absolute Percentage Error`,
                  results_rf_fishing_log$`Median Absolute Error`,
                  results_rf_fishing_log$`R-Squared`,
                  results_rf_fishing_log$`Adjusted R-Squared`,
                  results_rf_fishing_log$`Mean of Residuals`,
                  results_rf_fishing_log$`Standard Deviation of Residuals`),
                2) # Round to 2 decimal places
)

# Create and display the table
kable(results_table, format = "html", digits = 4,
      caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

# For feature importance, create a separate table
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

kable(feature_importance, format = "html", digits = 4,
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
```

### Maps of predictions

```{r}
# Prepare the prediction data
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions
log_predictions <- predict(rf_model_fishing_log, newdata = prediction_data)

# Back-transform predictions
predictions <- 10^log_predictions - 1

# Add predictions to the original dataset: observed AIS hours where AIS
# exists, model predictions where only SAR exists, 0 elsewhere.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(paste(lon_std, lat_std),
                                  paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Violin plot of observed versus predicted fishing hours
# Prepare data for plotting
plot_data <- combined_data_01deg %>%
  mutate(
    ais_fishing_hours = if_else(has_AIS, total_fishing_hours, NA_real_),
    sar_predicted_hours = if_else(!has_AIS & has_SAR, predicted_fishing_hours, NA_real_)
  ) %>%
  dplyr::select(ais_fishing_hours, sar_predicted_hours) %>%
  pivot_longer(
    cols = c(ais_fishing_hours, sar_predicted_hours),
    names_to = "type",
    values_to = "hours"
  ) %>%
  filter(!is.na(hours))

# Create violin plot
violin_plot <- ggplot(plot_data, aes(x = type, y = hours, fill = type)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white", color = "black", alpha = 0.5, outlier.shape = NA) +
  scale_y_log10(labels = scales::comma_format(accuracy = 1)) +
  scale_x_discrete(labels = c("ais_fishing_hours" = "AIS Data",
                              "sar_predicted_hours" = "SAR Predictions")) +
  labs(
    title = "Comparison of AIS Fishing Hours and SAR Predicted Fishing Hours",
    subtitle = "AIS data for AIS-covered areas, Predictions for SAR-only areas",
    x = "",
    y = "Fishing Hours (log scale)",
    fill = "Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(violin_plot)

# Map of predicted fishing hours only
# Prepare the data for the map.
# BUGFIX: renamed from `map_data` — that name shadowed ggplot2::map_data(),
# which is called right below (it only worked because R skips non-function
# objects during function-call lookup).
sar_map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# The data now contains back-transformed predictions, so no further transformation is needed
# predicted_SAR_only_1RF = sar_map_data
# save(predicted_SAR_only_1RF, file = "predicted_SAR_only_1RF.Rdata")

# Create the world map
world_map <- map_data("world")

# Function to create a map of predicted fishing hours for a specific region
create_region_map <- function(data, world_map, lon_col, lat_col, lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(
      data = world_map, map = world_map,
      aes(long, lat, map_id = region),
      color = "darkgray", fill = "lightgray", size = 0.1
    ) +
    geom_tile(data = data,
              aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)",
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(
      title = title,
      subtitle = subtitle,
      x = "Longitude", y = "Latitude"
    ) +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(sar_map_data, world_map, "lon_std", "lat_std",
                                             c(-180, 180), c(-90, 90),
                                             "Predicted Fishing Hours in Areas with Only SAR Detections",
                                             "0.1 degree resolution")

# Caribbean map
caribbean_map <- create_region_map(sar_map_data, world_map, "lon_std", "lat_std",
                                   c(-100, -50), c(0, 40),
                                   "Predicted Fishing Hours in the Caribbean",
                                   "0.1 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(sar_map_data, world_map, "lon_std", "lat_std",
                                         c(-20, 80), c(0, 70),
                                         "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.1 degree resolution")

# Asia map
asia_map <- create_region_map(sar_map_data, world_map, "lon_std", "lat_std",
                              c(60, 180), c(-20, 60),
                              "Predicted Fishing Hours in Asia",
                              "0.1 degree resolution")

# Print the maps
print(predicted_SAR_only_plot)
print(caribbean_map)
print(indian_european_map)
print(asia_map)

# Map of both original and predicted AIS fishing hours
# Visualize the results
predicted_plot <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

# Aggregate data to 0.5 degree resolution
# First, round the coordinates to the nearest 0.5 degree
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  )

# Aggregate the data
aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

# save(aggregated_data, file = here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Create the map
predicted_plot_05deg <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(data = aggregated_data,
            aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.5 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_05deg)

# Caribbean map
caribbean_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                   c(-100, -50), c(0, 40),
                                   "Fishing Hours in the Caribbean",
                                   "0.5 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                         c(-20, 80), c(0, 70),
                                         "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.5 degree resolution")

# Asia map
asia_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                              c(60, 180), c(-20, 60),
                              "Fishing Hours in Asia",
                              "0.5 degree resolution")

# Print the maps
print(caribbean_map)
print(indian_european_map)
print(asia_map)
```

### Test for spatial auto-correlation

```{r}
##------------ Create a non-random test set
# Prepare the data
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit() %>%
  mutate(log_total_fishing_hours = log10(total_fishing_hours + 1))

# Split the data into training and test sets
set.seed(123) # for reproducibility
train_indices <- sample(seq_len(nrow(data)), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model (commented out: loading a saved model instead)
# set.seed(123) # for reproducibility
# rf_timing <- system.time({
#   rf_model_fishing_log_train <- randomForest(
#     log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#     data = train_data,
#     ntree = 500,
#     importance = TRUE
#   )
# })
# cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
# # Save the new model
# save(rf_model_fishing_log_train, file = "rf_model_fishing_log_train.Rdata")

# Load the saved model
# load(here::here("rf_model_fishing_log_train.Rdata"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Make predictions on the test set
test_data$log_predictions <- predict(rf_model_fishing_log, test_data)

# Back-transform predictions and actual values
test_data$predictions <- 10^test_data$log_predictions - 1
test_data$total_fishing_hours_original <- 10^test_data$log_total_fishing_hours - 1

# Calculate residuals
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Create a neighbors list (k nearest neighbours)
k <- 8 # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)

# Create a spatial weights matrix (row-standardised)
lw <- nb2listw(nb, style = "W")

# Perform Moran's I test on the test set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Create a data frame with the test results
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance", "Standard deviate", "P-value"),
  Value = c(
    round(moran_test$estimate[1], 2),
    round(moran_test$estimate[2], 2),
    round(moran_test$estimate[3], 2),
    round(moran_test$statistic, 2),
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
           format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)

# Create and display the table
kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
```

### Spatial cross-validation

```{r}
# Load the training data and the model
load(here::here("training_data.Rdata"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Convert training_data to sf object if it's not already
if (!inherits(training_data, "sf")) {
  training_data_sf <- st_as_sf(training_data, coords = c("lon_std", "lat_std"), crs = 4326)
} else {
  training_data_sf <- training_data
}

# Perform k-means clustering to define 6 spatial blocks
set.seed(123) # for reproducibility
coords <- st_coordinates(training_data_sf)
kmeans_result <- kmeans(coords, centers = 6)

# Add cluster assignments to the sf object
training_data_sf$block <- as.factor(kmeans_result$cluster)

# Define the evaluate_model function (back-transforms from log10(x + 1) when
# log_target = TRUE; see earlier BUGFIX note on the predictor count)
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared
  n <- length(actual)
  p <- nrow(model$importance) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for Random Forest)
  feature_importance <- importance(model)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Predict for each fold
results_list <- list()
performance_metrics <- list()

for (i in 1:6) {
  # Get the test data for the i-th block
  test_data <- training_data_sf[training_data_sf$block == i, ]

  # Extract coordinates
  coords <- st_coordinates(test_data)

  # Create a new data frame for prediction
  test_data_df <- data.table(
    total_fishing_hours = test_data$total_fishing_hours,
    log_total_fishing_hours = log10(test_data$total_fishing_hours + 1),
    total_presence_score = test_data$total_presence_score,
    dist_shore = test_data$dist_shore,
    dist_ports = test_data$dist_ports,
    bathy = test_data$bathy,
    lon_std = coords[, "X"],
    lat_std = coords[, "Y"]
  )

  # Evaluate the model
  fold_metrics <- evaluate_model(rf_model_fishing_log, test_data_df, log_target = TRUE)

  # Store performance metrics
  performance_metrics[[i]] <- fold_metrics

  # Make predictions for results_list
  log_predictions <- predict(rf_model_fishing_log, newdata = test_data_df)
  predictions <- 10^log_predictions - 1

  # Store results
  results_list[[i]] <- data.table(
    observed = test_data$total_fishing_hours,
    predicted = predictions,
    fold = i,
    lon_std = coords[, "X"],
    lat_std = coords[, "Y"]
  )
}

# Create performance metrics table
performance_table <- data.frame(
  Metric = character(),
  Fold1 = numeric(), Fold2 = numeric(), Fold3 = numeric(),
  Fold4 = numeric(), Fold5 = numeric(), Fold6 = numeric(),
  stringsAsFactors = FALSE
)

for (metric in names(performance_metrics[[1]])) {
  if (metric != "Feature Importance") {
    row <- c(metric, sapply(performance_metrics, function(x) x[[metric]]))
    performance_table <- rbind(performance_table, row)
  }
}

# Convert numeric columns to numeric
performance_table[, 2:7] <- lapply(performance_table[, 2:7], as.numeric)

performance_metrics_names <- c(
  "Mean Absolute Error",
  "Root Mean Squared Error",
  "Mean Absolute Percentage Error",
  "Median Absolute Error",
  "R-Squared",
  "Adjusted R-Squared",
  "Mean of Residuals",
  "Standard Deviation of Residuals"
)
performance_table[, 1] <- performance_metrics_names
colnames(performance_table) <- c("Metric", "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6")

# Round the values
performance_table_formatted <- performance_table %>%
  mutate(across(Fold1:Fold6, ~ round(., 2)))

# Display the performance table
kable(performance_table_formatted,
      caption = "Model Performance for Each Fold",
      align = c('l', rep('r', 6)),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE)
```

## Predict fishing hours in areas with no fishing data

### Random forest model

```{r}
# Use combined_data_01deg which has the predicted_fishing_hours column
training_data <- combined_data_01deg %>%
  left_join(raster_df, by = c("lon_std", "lat_std")) %>%
  dplyr::select(predicted_fishing_hours, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Log-transform the fishing hours (add a small constant to avoid log(0))
training_data$log_fishing_hours <- log(training_data$predicted_fishing_hours + 1)

library(dplyr)

# Function to round to nearest 0.5
round_to_half <- function(x) {
  round(x * 2) / 2
}

# Aggregate the data to 0.5 degree resolution
training_data_05deg <- training_data %>%
  mutate(
    lon_05deg = round_to_half(lon_std),
    lat_05deg = round_to_half(lat_std)
  ) %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    dist_shore = mean(dist_shore, na.rm = TRUE),
    dist_ports = mean(dist_ports, na.rm = TRUE),
    bathy = mean(bathy, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    log_fishing_hours = log(predicted_fishing_hours + 1)
  )

# Rename columns to match original naming convention
training_data_05deg <- training_data_05deg %>%
  rename(lon_std = lon_05deg, lat_std = lat_05deg)

# Split the data into training and test sets
set.seed(123) # for reproducibility
sample_index <- sample(seq_len(nrow(training_data_05deg)), 0.8 * nrow(training_data_05deg))
train_data <- training_data_05deg[sample_index, ]
test_data <- training_data_05deg[-sample_index, ]

# Train the random forest model
# library(randomForest)
# rf_model_env <- randomForest(
#   log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy,
#   data = train_data,
#   ntree = 500,
#   importance = TRUE
# )
# Now, save the model as an RDS file
# saveRDS(rf_model_env, file = "rf_model_env_global_nodata.rds")
rf_model_env <- readRDS("rf_model_env_global_nodata.rds")
```

### Model performance

```{r}
# Evaluation helper for the environment-only model. This variant uses the
# natural-log transform (exp, matching log(x + 1) above) and a configurable
# log column name.
evaluate_model <- function(model, data, log_target = FALSE, log_column = "log_fishing_hours") {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- exp(predictions) - 1
  }
  # NOTE(review): the non-log branch reads data$fishing_hours, a column that
  # does not exist in the data prepared here (only predicted_fishing_hours /
  # log_fishing_hours do). Only the log_target = TRUE path is exercised below;
  # confirm the intended column before using log_target = FALSE.
  actual <- if (log_target) exp(data[[log_column]]) - 1 else data$fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared (predictor count from the importance matrix; see
  # earlier BUGFIX note)
  n <- length(actual)
  p <- nrow(model$importance) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for Random Forest)
  feature_importance <- importance(model)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Evaluate the model on the test set
results_rf_fishing_log <- evaluate_model(rf_model_env, test_data, log_target = TRUE)

# Create a dataframe for the metrics table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
             "Median Absolute Error", "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_fishing_log$`Mean Absolute Error`,
                  results_rf_fishing_log$`Root Mean Squared Error`,
                  results_rf_fishing_log$`Mean Absolute Percentage Error`,
                  results_rf_fishing_log$`Median Absolute Error`,
                  results_rf_fishing_log$`R-Squared`,
                  results_rf_fishing_log$`Adjusted R-Squared`,
                  results_rf_fishing_log$`Mean of Residuals`,
                  results_rf_fishing_log$`Standard Deviation of Residuals`),
                2) # Round to 2 decimal places
)

# Format the numbers in the results_table
results_table$Value <- sapply(results_table$Value, function(x) {
  if (abs(x) < 0.01 || abs(x) >= 1e6) {
    format(x, scientific = FALSE, big.mark = ",")
  } else {
    format(round(x, 2), nsmall = 2, big.mark = ",")
  }
})

# Create and display the metrics table
kable(results_table, format = "html",
      caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model",
      escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

# For feature importance, create a separate table
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

# Create and display the feature importance table
kable(feature_importance, format = "html", digits = 2,
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE)
%>%column_spec(2:3, width ="150px")```### Maps of predictions ```{r}# Aggregate data to 0.5 degree resolutionprediction_data_env <- raster_df %>% dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%filter(dist_shore >0) %>%# Only include ocean areasmutate(lon_std =floor(lon_std *2) /2, # Round to nearest 0.5lat_std =floor(lat_std *2) /2# Round to nearest 0.5 ) %>%group_by(lon_std, lat_std) %>%summarise(dist_shore =mean(dist_shore, na.rm =TRUE),dist_ports =mean(dist_ports, na.rm =TRUE),bathy =mean(bathy, na.rm =TRUE) ) %>%na.omit()# Load the random forest modelrf_model_env <-readRDS("rf_model_env_global_nodata.rds")# Make predictions using the modellog_predictions_env <-predict(rf_model_env, newdata = prediction_data_env)# Back-transform predictionspredictions_env <-exp(log_predictions_env) -1# Add predictions to the aggregated dataframeprediction_data_env$predicted_fishing_hours <- predictions_env# Global mapglobal_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(-180, 180),lat_range =c(-90, 90),title ="Global Fishing Hours",subtitle ="0.5 degree resolution, based on AIS data, SAR data and Random Forest predictions using environmental data")# Regional mapscaribbean_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(-100, -50), c(0, 40), "Predicted Fishing Hours in the Caribbean", "0.5 degree resolution")indian_european_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(-20, 80), c(0, 70), "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution")asia_map <-create_region_map( prediction_data_env, world_map, "lon_std", "lat_std", c(60, 180), c(-20, 60), "Predicted Fishing Hours in Asia", "0.5 degree resolution")# Print regional mapsprint(global_map)print(caribbean_map)print(indian_european_map)print(asia_map)```### Add MPA location as a predictor ```{r}# Extract MPA data for 
each point# Load the MPA binary rastermpa_raster <-raster(here::here("Data/mpa_ALL_binary.tif"))mpa_data <- raster::extract(mpa_raster, training_data_05deg[, c("lon_std", "lat_std")])# Add MPA data to the training datasettraining_data_05deg$mpa_present <- mpa_data# Split the data into training and test setsset.seed(123) # for reproducibilitysample_index <-sample(1:nrow(training_data_05deg), 0.8*nrow(training_data_05deg))train_data <- training_data_05deg[sample_index, ]test_data <- training_data_05deg[-sample_index, ]# Train the random forest model including MPA as a predictor#rf_model_env_mpa <- randomForest(# log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy + mpa_present,# data = train_data,# ntree = 500,# importance = TRUE#)# Save the new model#saveRDS(rf_model_env_mpa, file = here::here("rf_model_env_mpa_global.rds"))```### Model performance```{r}evaluate_model <-function(model, data, log_target =FALSE, log_column ="log_fishing_hours") { predictions <-predict(model, newdata = data)if (log_target) { predictions <-exp(predictions) -1 } actual <-if (log_target) exp(data[[log_column]]) -1else data$fishing_hours# Basic Error Metrics mae <-mean(abs(actual - predictions), na.rm =TRUE) rmse <-sqrt(mean((actual - predictions)^2, na.rm =TRUE)) mape <-mean(abs((actual - predictions) / actual) *100, na.rm =TRUE) medae <-median(abs(actual - predictions), na.rm =TRUE)# R-squared (matching randomForest's % Var explained) r_squared <- model$rsq[length(model$rsq)]# Adjusted R-squared n <-length(actual) p <-length(model$forest$independent.variable.names) # Number of predictors adj_r_squared <-1- ((1- r_squared) * (n -1) / (n - p -1))# Residual Analysis residuals <- actual - predictions mean_residual <-mean(residuals, na.rm =TRUE) sd_residual <-sd(residuals, na.rm =TRUE)# Feature Importance (for Random Forest) feature_importance <-importance(model)return(list("Mean Absolute Error"= mae,"Root Mean Squared Error"= rmse,"Mean Absolute Percentage Error"= 
mape,"Median Absolute Error"= medae,"R-Squared"= r_squared,"Adjusted R-Squared"= adj_r_squared,"Mean of Residuals"= mean_residual,"Standard Deviation of Residuals"= sd_residual,"Feature Importance"= feature_importance ))}# Evaluate the model on the test setrf_model_env_mpa <-readRDS(here::here("rf_model_env_mpa_global.rds"))results_rf_fishing_log <-evaluate_model(rf_model_env_mpa, test_data, log_target =TRUE)# Create a dataframe for the metrics tableresults_table <-data.frame(Metric =c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error","Median Absolute Error", "R-Squared", "Adjusted R-Squared","Mean of Residuals", "Standard Deviation of Residuals"),Value =round(c(results_rf_fishing_log$`Mean Absolute Error`, results_rf_fishing_log$`Root Mean Squared Error`, results_rf_fishing_log$`Mean Absolute Percentage Error`, results_rf_fishing_log$`Median Absolute Error`, results_rf_fishing_log$`R-Squared`, results_rf_fishing_log$`Adjusted R-Squared`, results_rf_fishing_log$`Mean of Residuals`, results_rf_fishing_log$`Standard Deviation of Residuals`),2) # Round to 2 decimal places)# Format the numbers in the results_tableresults_table$Value <-sapply(results_table$Value, function(x) {if (abs(x) <0.01||abs(x) >=1e6) {format(x, scientific =FALSE, big.mark =",") } else {format(round(x, 2), nsmall =2, big.mark =",") }})# Create and display the metrics tablekable(results_table, format ="html", caption ="Model Evaluation Metrics for Log-Transformed Fishing Hours Model with MPA location", escape =FALSE) %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center")# For feature importance, create a separate tablefeature_importance <-as.data.frame(results_rf_fishing_log$`Feature Importance`)feature_importance$Feature <-rownames(feature_importance)feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]colnames(feature_importance) <-c("Feature", "%IncMSE", 
"IncNodePurity")# Sort the feature importance table by %IncMSE in descending orderfeature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]# Create and display the feature importance tablekable(feature_importance, format ="html", digits =2, col.names =c("Feature", "%IncMSE", "IncNodePurity"),caption ="Feature Importance for Log-Transformed Fishing Hours Model with MPA location") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(1, bold =TRUE) %>%column_spec(2:3, width ="150px")```### Maps of predictions ```{r}# Load the MPA binary rastermpa_raster <-raster(here::here("Data/mpa_ALL_binary.tif"))# Aggregate data to 0.5 degree resolutionprediction_data_env <- raster_df %>% dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%filter(dist_shore >0) %>%# Only include ocean areasmutate(lon_std =floor(lon_std *2) /2, # Round to nearest 0.5lat_std =floor(lat_std *2) /2# Round to nearest 0.5 ) %>%group_by(lon_std, lat_std) %>%summarise(dist_shore =mean(dist_shore, na.rm =TRUE),dist_ports =mean(dist_ports, na.rm =TRUE),bathy =mean(bathy, na.rm =TRUE) ) %>%na.omit()# Extract MPA data for each point in the prediction gridmpa_data <- raster::extract(mpa_raster, prediction_data_env[, c("lon_std", "lat_std")])# Add MPA data to the prediction datasetprediction_data_env$mpa_present <- mpa_data# Make sure there are no NA values in mpa_present columnprediction_data_env <- prediction_data_env %>%mutate(mpa_present =ifelse(is.na(mpa_present), 0, mpa_present))# Load the random forest model that includes MPA as a predictorrf_model_env_mpa <-readRDS("rf_model_env_mpa_global.rds")# Make predictions using the model that includes MPAlog_predictions_env_mpa <-predict(rf_model_env_mpa, newdata = prediction_data_env)# Back-transform predictionspredictions_env_mpa <-exp(log_predictions_env_mpa) -1# Add predictions to the aggregated 
dataframeprediction_data_env$predicted_fishing_hours <- predictions_env_mpasave(prediction_data_env, file=here::here("Data", "predicted_fishing_everywhere_05Deg.Rdata"))# Global mapglobal_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(-180, 180),lat_range =c(-90, 90),title ="Global Fishing Hours",subtitle ="0.5 degree resolution, based on AIS data, SAR data, MPA presence, and Random Forest predictions using environmental data")# Regional mapscaribbean_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(-100, -50), lat_range =c(0, 40), title ="Predicted Fishing Hours in the Caribbean", subtitle ="0.5 degree resolution")indian_european_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(-20, 80), lat_range =c(0, 70), title ="Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", subtitle ="0.5 degree resolution")asia_map <-create_region_map(data = prediction_data_env, world_map = world_map, lon_col ="lon_std", lat_col ="lat_std", lon_range =c(60, 180), lat_range =c(-20, 60), title ="Predicted Fishing Hours in Asia", subtitle ="0.5 degree resolution")# Print mapsprint(global_map)print(caribbean_map)print(indian_european_map)print(asia_map)```